In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import random
In [2]:
# Episode comment table; the file's first column is used as the index.
episode_comment = pd.read_csv(
    "data/webnovel/episode_comments.csv",
    index_col=0,
    encoding="cp949",
)
In [ ]:
# Split object_id once; field 0 is kept as the book ID and field 1 as the
# (integer) volume number.
object_id_parts = episode_comment["object_id"].str.split("-")
episode_comment["ID"] = object_id_parts.str[0]
episode_comment["volume"] = object_id_parts.str[1].astype("int")

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form is deprecated in pandas 2.x and
# silently does nothing once Copy-on-Write is enabled.
episode_comment["writer_nickname"] = episode_comment["writer_nickname"].fillna("")

# user_id = nickname + writer_id, falling back to writer_ip + writer_id for
# rows with no nickname. Computed column-wise, so it no longer relies on the
# index labels (taken from the CSV via index_col=0) being exactly 0..n-1, and
# it avoids one Python-level .loc lookup per row.
# NOTE(review): a missing writer_ip / writer_id now yields a NaN user_id for
# that row rather than raising TypeError — confirm such rows do not occur.
nickname = episode_comment["writer_nickname"]
writer_label = nickname.where(nickname != "", episode_comment["writer_ip"])
user_id = writer_label + episode_comment["writer_id"]
episode_comment["user_id"] = user_id

# Keep only the columns used downstream (ID, volume, user_id and whatever else
# the file carries that is not listed here).
episode_comment = episode_comment.drop(
    columns=[
        "contents",
        "down_count",
        "modified_ymdt",
        "registered_ymdt",
        "ticket",
        "up_count",
        "writer_ip",
        "writer_id",
        "writer_nickname",
        "writer_profile_type",
        "object_id",
    ]
)
episode_comment.head()
In [238]:
# Main-page comment table; the file's first column is used as the index.
main_comment = pd.read_csv(
    "data/webnovel/main_comments.csv",
    index_col=0,
    encoding="cp949",
)
In [239]:
# The book ID is field 1 of object_id here.
# NOTE(review): the episode-comment cell takes field 0 as the ID — presumably
# the two files use different object_id layouts; confirm.
main_comment["ID"] = main_comment["object_id"].str.split("-").str[1]
# Main-page comments are not tied to an episode; they all get volume 0.
main_comment["volume"] = 0

# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection: the chained in-place form is deprecated in pandas 2.x and
# silently does nothing once Copy-on-Write is enabled.
main_comment["writer_nickname"] = main_comment["writer_nickname"].fillna("")

# user_id = nickname + writer_id, falling back to writer_ip + writer_id for
# rows with no nickname. Computed column-wise, so it no longer relies on the
# index labels (taken from the CSV via index_col=0) being exactly 0..n-1, and
# it avoids one Python-level .loc lookup per row.
# NOTE(review): a missing writer_ip / writer_id now yields a NaN user_id for
# that row rather than raising TypeError — confirm such rows do not occur.
nickname = main_comment["writer_nickname"]
writer_label = nickname.where(nickname != "", main_comment["writer_ip"])
user_id = writer_label + main_comment["writer_id"]
main_comment["user_id"] = user_id

# Keep only the columns used downstream (ID, volume, user_id and whatever else
# the file carries that is not listed here).
main_comment = main_comment.drop(
    columns=[
        "contents",
        "down_count",
        "modified_ymdt",
        "registered_ymdt",
        "ticket",
        "up_count",
        "writer_ip",
        "writer_id",
        "writer_nickname",
        "writer_profile_type",
        "object_id",
    ]
)
main_comment.head()
Out[239]:
In [269]:
# One row per (user_id, ID) pair. np.size counts the rows in each group, so
# after this step "volume" holds how many comment rows the pair contributed,
# not a volume number.
user_df = (
    pd.concat([episode_comment, main_comment])
    .groupby(["user_id", "ID"], as_index=False)
    .agg({"volume": np.size})
)
In [270]:
# Number of distinct (user_id, ID) pairs.
user_df.shape[0]
Out[270]:
In [271]:
df = pd.read_csv("data/webnovel/main_df.csv", encoding="cp949", index_col=0)
df["ID"] = df["ID"].astype("str")
In [243]:
df = user_df.merge(df, on="ID")[["user_id", "genre", "volume"]].drop_duplicates()
In [244]:
# Distinct users that survived the genre merge (missing values counted too).
df["user_id"].nunique(dropna=False)
Out[244]:
In [245]:
# Rows whose genre code is 101.
romance = df.loc[df["genre"].eq(101)]
In [246]:
# Rows whose genre code is anything other than 101.
no_romance = df.loc[df["genre"].ne(101)]
In [247]:
# Rows produced by joining the two genre subsets on user_id.
# NOTE(review): a user with several rows on either side contributes one row
# per pairing, so this is a pair count rather than a distinct-user count —
# confirm which was intended.
romance.merge(no_romance, on="user_id").shape[0]
Out[247]:
In [248]:
# Number of distinct users (row count of the user x book matrix).
user_size = user_df["user_id"].nunique(dropna=False)
In [249]:
# Distinct user ids in order of first appearance.
users = pd.unique(user_df["user_id"])
In [250]:
# user id -> row position in the user x book matrix.
users_index = dict(zip(users, range(len(users))))
In [266]:
# Book metadata table; the file's first column is used as the index.
book_df = pd.read_csv(
    "data/webnovel/main_df.csv",
    encoding="cp949",
    index_col=0,
)
In [264]:
# Number of distinct books (column count of the user x book matrix).
book_size = book_df["ID"].nunique(dropna=False)
In [253]:
# Distinct book IDs in order of first appearance.
books = pd.unique(book_df["ID"])
In [262]:
# How many distinct books there are.
books.shape[0]
Out[262]:
In [254]:
# book ID (as a string, to match user_df["ID"]) -> column position in the
# user x book matrix.
books_index = dict(zip(map(str, books), range(len(books))))
In [255]:
# Column position of every interaction's book.
# NOTE(review): raises KeyError if a commented book ID is absent from
# main_df.csv — confirm every ID is covered.
user_df["book_index"] = user_df["ID"].map(lambda book_id: books_index[book_id])
In [256]:
# Row position of every interaction's user.
user_df["user_index"] = user_df["user_id"].map(lambda user: users_index[user])
In [257]:
# Dense users x books array of zeros, filled in by the next cell.
empty_matrix = np.zeros((user_size, book_size), dtype=float)
In [259]:
# Write every (user, book) count into the matrix with a single indexed
# assignment instead of a Python-level loop over iterrows(); the cells written
# and the values stored are the same.
empty_matrix[
    user_df["user_index"].values,
    user_df["book_index"].values,
] = user_df["volume"].values
In [265]:
# Wrap the filled array in a DataFrame whose columns are the book IDs.
user_book_matrix = pd.DataFrame(data=empty_matrix, columns=books)
In [22]:
# Label the rows with the user ids (same order as users_index).
user_book_matrix.index = pd.Index(users)
In [23]:
# Display the user x book matrix (rows: user ids, columns: book IDs).
user_book_matrix
Out[23]:
In [335]:
# The per-user row totals do not change between iterations, so compute them
# once instead of re-summing the whole matrix 15 times.
row_totals = user_book_matrix.sum(axis=1)
for i in range(15):
    # Users whose total is strictly greater than i, reported as "i+1 or more".
    print(i+1, "권 이상 읽은 사람은", int((row_totals > i).sum()), "명 입니다.")
In [179]:
from scipy.spatial import distance
def cosine_distance(a, b):
    """Return the cosine similarity of the vectors ``a`` and ``b``.

    Despite the name, the result is ``1 - scipy.spatial.distance.cosine(a, b)``,
    so larger values mean the two vectors point in more similar directions.
    """
    dissimilarity = distance.cosine(a, b)
    return 1 - dissimilarity
In [216]:
def make_score(books, n_samples=10, n_neighbors=11):
    """Compute an MAE score for a nearest-neighbour prediction.

    Rows of the module-level ``user_book_matrix`` whose total exceeds ``books``
    are kept. For each of the first ``n_samples`` kept users, the
    ``n_neighbors`` most cosine-similar other kept users are averaged into a
    predicted row, and the absolute error against the user's own row is
    averaged over the columns where that user has a positive value.

    Parameters
    ----------
    books : number
        Threshold on a user's row total; only users strictly above it are used.
    n_samples : int, default 10
        How many users (taken from the top of the filtered matrix) to score.
        Capped at the number of users available.
    n_neighbors : int, default 11
        Maximum number of neighbours averaged into each prediction.

    Returns
    -------
    float
        Mean of the per-user MAE values, or NaN when no user could be scored.
        The value is also printed.
    """
    active = user_book_matrix[user_book_matrix.sum(axis=1) > books]
    values = active.values.astype(float)
    # Fewer users than requested used to raise KeyError on `.loc[i]`.
    n_eval = min(n_samples, values.shape[0])

    errors = []
    if n_eval > 0:
        # Cosine similarity of every sampled user against *all* kept users in
        # one matrix product. The earlier nested loop filled only pairs with
        # j > i, so users listed before a sampled user could never be chosen.
        norms = np.linalg.norm(values, axis=1)
        dots = values[:n_eval] @ values.T
        scale = norms[:n_eval, None] * norms[None, :]
        similarity = np.divide(dots, scale, out=np.zeros_like(dots), where=scale > 0)

        for user in range(n_eval):
            sims = similarity[user].copy()
            sims[user] = 0.0  # a user is never their own neighbour
            # Skip profiles with similarity 1 (same direction). The earlier
            # code tested `argmax() >= 1`, i.e. the *position* of the maximum
            # rather than its value, which discarded every candidate except
            # column 0 and made user 0 the only neighbour of everybody.
            # A small tolerance absorbs floating-point rounding.
            sims[sims >= 1.0 - 1e-9] = 0.0

            # Highest similarity first; ties keep the lower position first,
            # which is what repeated argmax() gave.
            candidates = np.flatnonzero(sims > 0)
            ranked = candidates[np.argsort(-sims[candidates], kind="stable")]
            neighbours = ranked[:n_neighbors]

            target = values[user]
            rated = target > 0
            if neighbours.size == 0 or not rated.any():
                # Nothing to predict from / nothing to compare against.
                continue
            predicted = values[neighbours].mean(axis=0)
            errors.append(np.abs(target[rated] - predicted[rated]).mean())

    score = np.array(errors).mean() if errors else np.nan
    print(score)
    return score
In [218]:
# MAE score for each row-total threshold from 0 through 9.
scores = [make_score(threshold) for threshold in range(10)]
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [52]:
# Rows of user_df for the user at position empty_matrix[0].argmax() of users_two_index.
# NOTE(review): in the visible cells `users_two_index` appears to be assigned
# only inside make_score, and the notebook-level `empty_matrix` is the
# user x book array — on a fresh Restart & Run All this cell likely raises
# NameError. Confirm, then either expose these values from make_score or
# remove the cell.
user_df[user_df["user_id"] == users_two_index[empty_matrix[0].argmax()]]
Out[52]:
In [53]:
# Rows of user_df for the first entry of users_two_index.
# NOTE(review): in the visible cells `users_two_index` appears to be assigned
# only inside make_score, so this cell likely raises NameError on a fresh
# Restart & Run All — confirm.
user_df[user_df["user_id"] == users_two_index[0]]
Out[53]:
In [42]:
# Display user_books_matrix_two.
# NOTE(review): in the visible cells this name appears to be assigned only
# inside make_score, so this cell likely raises NameError on a fresh
# Restart & Run All — confirm.
user_books_matrix_two
Out[42]:
In [ ]: